Please do not run the code in this section (Chapter 1): it is meant to be run as standalone scripts on the zeno server in order to stream tweets with tweepy and Reddit submissions with PRAW (which stands for Python Reddit API Wrapper) and to store them in the database.
The class MyStreamListener was inspired by https://gist.github.com/hugobowne/18f1c0c0709ed1a52dc5bcd462ac69f4
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
import time
import tweepy
import json
import os  # local import: only needed to read the optional credential overrides

# NOTE(review): the literals below are credentials committed to source control.
# They are kept only as fallbacks so the script behaves exactly as before when
# no environment variable is set; prefer supplying the variables and rotating
# the committed keys.

# PostgreSQL database that receives the collected tweets.
engine = create_engine(os.environ.get(
    "DISRUPTIVE_DB_URL",
    'postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive'))
connection = engine.connect()

# Twitter API credentials (environment variable first, committed literal second).
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "Bcc9APjXSl4eH5nqJq6AqKQo0")
consumer_key_secret = os.environ.get(
    "TWITTER_CONSUMER_SECRET", "sT9VcZYL1yCnqnDzXETrU9nT09qdbmntmskq011cp2W9o8iHHh")
access_token = os.environ.get(
    "TWITTER_ACCESS_TOKEN", "988817670765273090-bKzdX7q9MOBs0ZXx6eclHxRmEWr4UCP")
access_token_secret = os.environ.get(
    "TWITTER_ACCESS_TOKEN_SECRET", "9ioSZK6DWiDUi7y7y8aR2Rsr3jJUWyg2tnamrxK8cCfpx")

# OAuth handler shared by the REST client and the streaming connection.
auth = tweepy.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
# Number of tweets gathered per iteration before the stream is closed.
TWEETS_PER_BATCH = 2000
# Seconds to wait between two iterations (15 minutes).
PAUSE_SECONDS = 900

# Keywords handed to the Twitter streaming filter.
TRACKED_KEYWORDS = ['Bitcoin', 'bitcoin', 'BitCoin', 'BTC', 'bitCoin',
                    'BitcoinClassic', 'bitcoinclassic', 'bitcoinClassic', 'XBT',
                    'Ether', 'ether', 'ethereum', 'Ethereum', 'ETH', 'ETC',
                    'Ethereum Classic', 'EthereumClassic']

# Columns of the frame built from the raw tweets ("lang" is dropped before saving).
TWEET_COLUMNS = ["text", "tweet_created", "full_text", "profile_created", "user_name",
                 "followers_count", "friends_count", "time_zone", "total_tweets",
                 "user_location", "acc_descr", "verified", "lang", "retweet_count",
                 "retweeter_requoter", "influencer", "influencer_quoted",
                 "user_screen_name"]

# "+%f" consumes the numeric offset of the timestamp string, so the parsed
# values are timezone-naive.
TWITTER_DATE_FORMAT = "%a %b %d %H:%M:%S +%f %Y"

# pandas column -> SQL column type used when appending to the `twitter3` table.
TWITTER_SQL_TYPES = {
    "acc_descr": String(1000), "followers_count": Integer(), "friends_count": Integer(),
    "influencer": String(100), "influencer_quoted": String(100),
    "profile_created": DateTime(), "retweet_count": Integer(),
    "retweeter_requoter": String(100), "text": String(10000),
    "time_zone": String(100), "total_tweets": Integer(), "tweet_created": DateTime(),
    "user_location": String(1000), "user_name": String(1000), "verified": Boolean(),
    "full_text": String(10000), "user_screen_name": String(1000),
}


class MyStreamListener(tweepy.StreamListener):
    """Stream listener that keeps the raw JSON of every received tweet.

    The listener asks tweepy to close the stream once TWEETS_PER_BATCH tweets
    have been collected.
    """

    def __init__(self, api=None):
        # Run the initializer of tweepy.StreamListener so that all of its
        # methods and attributes are available on this subclass.
        super(MyStreamListener, self).__init__(api)
        self.num_tweets = 0
        # Per-instance storage: every new listener starts with an empty batch.
        self.tweets_data = []

    def on_status(self, status):
        """Store the tweet; return False (stop streaming) when the batch is full."""
        self.tweets_data.append(status._json)
        self.num_tweets += 1
        return self.num_tweets < TWEETS_PER_BATCH

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)


def _nested(tweet, *keys, default):
    """Return tweet[k1][k2]..., or *default* when a level is missing."""
    try:
        for key in keys:
            tweet = tweet[key]
    except (KeyError, TypeError):
        return default
    return tweet


def _tweet_to_row(tweet):
    """Flatten one raw tweet payload into the columns stored in the database."""
    user = tweet["user"]
    return {
        "text": tweet["text"],
        "tweet_created": tweet["created_at"],
        "full_text": _nested(tweet, "extended_tweet", "full_text",
                             default="not extended tweet"),
        "profile_created": user["created_at"],
        "user_name": user["name"],
        "followers_count": user["followers_count"],
        "friends_count": user["friends_count"],
        "time_zone": user["time_zone"],  # deprecated by Twitter, but partially filled
        "total_tweets": user["statuses_count"],
        "user_location": user["location"],
        "acc_descr": user["description"],
        "verified": user["verified"],
        "lang": tweet["lang"],
        "retweet_count": tweet["retweet_count"],
        # the author of this tweet is the follower in a retweet/quote relation ...
        "retweeter_requoter": user["screen_name"],
        # ... and the author of the retweeted / quoted tweet is the influencer
        "influencer": _nested(tweet, "retweeted_status", "user", "screen_name",
                              default="not retweeted post"),
        "influencer_quoted": _nested(tweet, "quoted_status", "user", "screen_name",
                                     default="not quoted post"),
        "user_screen_name": user["screen_name"],
    }


def _tweets_to_frame(tweets_data):
    """Build the cleaned DataFrame (English tweets only) from raw tweet payloads."""
    frame = pd.DataFrame([_tweet_to_row(tweet) for tweet in tweets_data],
                         columns=TWEET_COLUMNS)
    # Work on an explicit copy so the edits below never target a slice of `frame`.
    english = frame[frame["lang"] == "en"].copy()
    english = english.drop(["lang"], axis=1)
    for column in ("tweet_created", "profile_created"):
        english[column] = pd.to_datetime(english[column], format=TWITTER_DATE_FORMAT)
    return english


counter = 0
while True:  # never-ending loop: collect a batch, store it, pause, start over
    counter += 1
    print("start iteration", counter)

    # Stream tweets matching the keywords until the listener closes the stream.
    listener = MyStreamListener()
    stream = tweepy.Stream(auth, listener)
    stream.filter(track=TRACKED_KEYWORDS)

    # Clean the batch and append it to the Postgres table.
    twitter_df = _tweets_to_frame(listener.tweets_data)
    twitter_df.to_sql('twitter3', con=engine, if_exists='append', index=False,
                      dtype=TWITTER_SQL_TYPES)

    print(counter, "iteration is completed")
    time.sleep(PAUSE_SECONDS)
import pandas as pd
import praw # import Python Reddit API Wrapper
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
import time
#import tweepy
#_____________________________________________________________________________________________________
# Stream the data from reddit and show the sentiment over time + sentiment comparison bw. BTC and ETH
#_____________________________________________________________________________________________________
import os  # local import: only needed to read the optional credential overrides

# NOTE(review): credentials are committed to source control, and the account
# name/password are masked placeholders. Environment variables take precedence
# so the script can run without editing the source; the literals remain as
# fallbacks to keep the previous behaviour when no variable is set.
client_id = os.environ.get("REDDIT_CLIENT_ID", "2BKbi2rOzcWy5w")
secret = os.environ.get("REDDIT_CLIENT_SECRET", "GOa9xfMkea62qn6U7yHdVonrF-g")

# Reddit instance used by the collection loop.
reddit = praw.Reddit(client_id=client_id, client_secret=secret,
                     password=os.environ.get("REDDIT_PASSWORD", "************"),
                     user_agent='praw_test',
                     username=os.environ.get("REDDIT_USERNAME", '***********'))

# PostgreSQL database that receives the scored submissions.
engine = create_engine(os.environ.get(
    "DISRUPTIVE_DB_URL",
    'postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive'))
connection = engine.connect()
import re
from textblob import TextBlob
def get_sentiment(sentence):
    """Return the TextBlob polarity of *sentence*.

    Values above 0 indicate a positive sentiment, values below 0 a negative one.
    """
    polarity = TextBlob(sentence).sentiment.polarity
    return polarity
def get_grouped_sentiment(sentence):
    """Classify the TextBlob polarity of *sentence* into a sentiment label.

    Returns 'positive' for a polarity above 0.05, 'neutral' for a polarity in
    (-0.005, 0.05] and 'negative' otherwise.
    """
    # NOTE(review): the lower bound (-0.005) is not symmetric to the upper
    # bound (0.05) - confirm this is intended.
    score = TextBlob(sentence).sentiment.polarity
    if score > 0.05:
        return 'positive'
    if score > -0.005:
        return 'neutral'
    return 'negative'
# Attributes kept from every submission. The long-form selftext, the comments
# and the url are deliberately left out for consistency between submissions:
# only some have a selftext, and comments are nested, so they do not fit into
# a single flat table.
REDDIT_FIELDS = ('title', 'created_utc', 'num_comments', 'ups', 'downs')

# Title patterns that assign a submission to the BTC and/or ETH table.
# NOTE(review): the lowercase alternatives "ether"/"eth" also match inside
# ordinary words (e.g. "whether", "method") - confirm whether word boundaries
# are wanted before tightening the pattern.
BTC_TITLE_PATTERN = re.compile(
    r"(bitcoin|Bitcoin|BTC|BitCoin|bitCoin|BitcoinClassic|Bitcoin Classic|bitcoinclassic|bitcoinClassic|XBT)")
ETH_TITLE_PATTERN = re.compile(
    r"(ethereum|Ethereum|ETH|ETC|Ethereum Classic|EthereumClassic|ether|eth)")

# pandas column -> SQL column type for the btc_reddit / eth_reddit tables.
REDDIT_SQL_TYPES = {"created_utc": DateTime(), "downs": Integer(), "ups": Integer(),
                    "num_comments": Integer(), "title": String(10000),
                    "SA_score": Float(), "SA_score_grouped": String(20)}


def _submission_to_dict(submission):
    """Return the REDDIT_FIELDS attributes of *submission* as a dictionary."""
    attributes = vars(submission)  # dictionary form of the submission's attributes
    return {field: attributes[field] for field in REDDIT_FIELDS}


def _score_matching_titles(reddit_df, pattern):
    """Return the rows of *reddit_df* whose title matches *pattern*, with scores.

    Rows are selected directly instead of merging a list of titles back onto
    the frame: a merge on "title" multiplies rows whenever two submissions
    share the same title.
    """
    is_match = pd.Series([pattern.search(title) is not None for title in reddit_df["title"]],
                         index=reddit_df.index, dtype=bool)
    matched = reddit_df[is_match].copy()
    matched["SA_score"] = [get_sentiment(title) for title in matched["title"]]
    matched["SA_score_grouped"] = [get_grouped_sentiment(title) for title in matched["title"]]
    return matched.reset_index(drop=True)


print("start")
counter = 0
while True:
    counter += 1
    print("start iteration", counter)

    list_of_items = [_submission_to_dict(submission)
                     for submission in reddit.subreddit('CryptoCurrency').new(limit=None)]

    # An empty listing yields a frame without columns; skip the iteration
    # instead of failing on the missing "created_utc" column.
    if list_of_items:
        ## clean the reddits
        reddit_df = pd.DataFrame(list_of_items)
        # convert the unix timestamp (seconds) to a pandas datetime
        reddit_df["created_utc"] = pd.to_datetime(reddit_df["created_utc"], unit='s')

        # split the submissions into BTC and ETH and score their titles
        btc_reddit = _score_matching_titles(reddit_df, BTC_TITLE_PATTERN)
        eth_reddit = _score_matching_titles(reddit_df, ETH_TITLE_PATTERN)

        btc_reddit.to_sql('btc_reddit', con=engine, if_exists='append', index=False,
                          dtype=REDDIT_SQL_TYPES)
        eth_reddit.to_sql('eth_reddit', con=engine, if_exists='append', index=False,
                          dtype=REDDIT_SQL_TYPES)

    print(counter, " iteration is completed.")
    time.sleep(600)
The following script produces an HTML page that is served on localhost by a Flask server on port 8050: http://127.0.0.1:8050/
To run it, list the required libraries in a file called requirements.txt and install all of the above-mentioned libraries from the command line (Anaconda Prompt): (1) cd to the directory where you stored the file requirements.txt; (2) run "pip install -r requirements.txt"; (3) run "python Cryptocurrency_Dashboard_Disruptive.py"; (4) open http://127.0.0.1:8050/ in your browser.
import dash
import dash_auth
import dash_core_components as dcc
import dash_html_components as html
import dash_table_experiments as dt
import plotly.graph_objs as go
import re
import numpy as np
import pandas as pd
import requests
import json
from textblob import TextBlob
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
# from dashapp import server as application
import os  # local import: only needed to read the optional database URL override

app = dash.Dash(__name__)
# if you want to implement USER and PASSWORD, please uncomment the following 3 lines of code:
# from dash.dependencies import Input, Output
# USERNAME_PASSWORD_PAIRS = [['bipm', 'crypto']]
# auth = dash_auth.BasicAuth(app,USERNAME_PASSWORD_PAIRS)
#_____________________________________________________________________________________________________
# Get the streamed data from REDDIT and show the sentiment over time + aggregated sentiment + comparison bw. BTC and ETH
#_____________________________________________________________________________________________________
# REDDIT preprocessing
#_____________________________________________________________________________________________________
# NOTE(review): the fallback URL embeds the database password in source control;
# set DISRUPTIVE_DB_URL instead where possible.
engine = create_engine(os.environ.get(
    "DISRUPTIVE_DB_URL",
    'postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive'))
connection = engine.connect()

btc_reddit = pd.read_sql(
    sql='select distinct title, created_utc, "SA_score_grouped", "SA_score" from btc_reddit order by created_utc desc',
    con=connection, index_col=None)
eth_reddit = pd.read_sql(
    sql='select distinct title, created_utc, "SA_score_grouped", "SA_score" from eth_reddit order by created_utc desc',
    con=connection, index_col=None)

# Union of the BTC and ETH rows, used later for the interactive data table.
# Only non-default arguments are passed: `join_axes` no longer exists in
# pandas >= 1.0 and passing it (even as None) raises a TypeError there.
reddit = pd.concat([btc_reddit, eth_reddit], axis=0, join='outer')
reddit["SA_score"] = reddit["SA_score"].round(2)


def _sentiment_counts(frame):
    """Count the rows of *frame* per sentiment label.

    Returns a frame with the columns "nr_of_tweets" (the count; the name is
    kept although the rows are Reddit submissions) and "sentiment" (the label).
    """
    counts = frame[["SA_score", "SA_score_grouped"]].groupby("SA_score_grouped").count()
    counts["sentiment"] = counts.index
    counts.reset_index(drop=True, inplace=True)
    counts.rename(columns={"SA_score": "nr_of_tweets"}, inplace=True)
    return counts


# Aggregated sentiment on Reddit
btc_grouped = _sentiment_counts(btc_reddit)
eth_grouped = _sentiment_counts(eth_reddit)
#_____________________________________________________________________________________________________
# BTC and ETH values over time - Preprocessing
#_____________________________________________________________________________________________________
def _load_price_history(history_url):
    """Download a CryptoCompare daily history and return it as a DataFrame.

    The list stored under the "Data" key of the JSON response becomes the
    rows; the unix "time" column is additionally exposed as a pandas datetime
    in the "timestamp" column.
    """
    payload = requests.get(history_url).json()
    history = pd.DataFrame(payload["Data"])
    history["timestamp"] = pd.to_datetime(history["time"], unit='s')
    return history


btc_values_df = _load_price_history(
    'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=365')
eth_values_df = _load_price_history(
    'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=365')
#_____________________________________________________________________________________________________
# Get the streamed data from TWITTER and show the sentiment over time + aggregated sentiment + comparison bw. BTC and ETH
#_____________________________________________________________________________________________________
# Twitter preprocessing
#_____________________________________________________________________________________________________
# Sample of the stored tweets whose text mentions one of the two currencies.
twitter_df = pd.read_sql(
    sql="select distinct text, tweet_created from twitter3 TABLESAMPLE SYSTEM(1) where text ~* '(btc|#eth|ether|bitcoin|ethereum)' order by tweet_created desc",
    con=connection,
    index_col=None,
)
list_of_tweets = twitter_df.text.tolist()

# The SQL query only pre-filters the tweets; the regular expressions below
# separate them into ETH-related and BTC-related tweets (a tweet may be both).
_eth_tweet_re = re.compile(
    r"(ethereum|Ethereum|ETH|ETC|Ethereum Classic|EthereumClassic|ether|eth)")
_btc_tweet_re = re.compile(
    r"(bitcoin|Bitcoin|BTC|BitCoin|bitCoin|BitcoinClassic|Bitcoin Classic|bitcoinclassic|bitcoinClassic|XBT)")

eth_tweets = [entry for entry in list_of_tweets if _eth_tweet_re.search(entry) is not None]
btc_tweets = [entry for entry in list_of_tweets if _btc_tweet_re.search(entry) is not None]
# Twitter Sentiment analysis
def get_sentiment(sentence):
    """Return the TextBlob polarity of *sentence*, rounded to two decimals.

    Values above 0 indicate a positive sentiment, values below 0 a negative one.
    """
    polarity = TextBlob(sentence).sentiment.polarity
    return round(polarity, 2)
def _score_tweets(tweets):
    """Return the rows of `twitter_df` whose text is in *tweets*, with scores.

    Rows are selected directly instead of merging the list of texts back onto
    `twitter_df`: a merge on "text" yields k*k rows for a text that is stored
    with k different timestamps, which inflates its weight in every later
    aggregation.
    """
    rows = twitter_df[twitter_df["text"].isin(tweets)].copy().reset_index(drop=True)
    rows["SA_score"] = [get_sentiment(text) for text in rows["text"]]
    return rows[["text", "SA_score", "tweet_created"]]


# Twitter sentiment analysis
btc_twitter_sa = [get_sentiment(sentence) for sentence in btc_tweets]
twitter_btc_df = _score_tweets(btc_tweets)
eth_twitter_sa = [get_sentiment(sentence) for sentence in eth_tweets]
twitter_eth_df = _score_tweets(eth_tweets)

# Union of the BTC and ETH rows, used later for the interactive data table.
# Only non-default arguments are passed: `join_axes` no longer exists in
# pandas >= 1.0 and passing it (even as None) raises a TypeError there.
twitter = pd.concat([twitter_btc_df, twitter_eth_df], axis=0, join='outer')

# One row per distinct text for the detailed sentiment plot
twitter_btc_df2 = twitter_btc_df.drop_duplicates(subset=["text"], keep='first', inplace=False)
twitter_eth_df2 = twitter_eth_df.drop_duplicates(subset=["text"], keep='first', inplace=False)
# Function for aggregated sentiment
def get_short_sentiment(sentence):
    """Map a sentiment score to 'positive', 'neutral' or 'negative'.

    Despite its name, *sentence* is the numeric score: values above 0.05 are
    'positive', values in (-0.005, 0.05] are 'neutral', everything else is
    'negative'.
    """
    # NOTE(review): the lower bound (-0.005) is not symmetric to the upper
    # bound (0.05) - confirm this is intended.
    if sentence > 0.05:
        return 'positive'
    if sentence > -0.005:
        return 'neutral'
    return 'negative'
def _label_and_count(scored_tweets):
    """Label every score of *scored_tweets* and count the tweets per label.

    Returns the list of labels and a frame with the columns "nr_of_tweets"
    (count per label) and "sentiment" (the label).
    """
    labels = [get_short_sentiment(score) for score in scored_tweets.SA_score]
    per_label = pd.DataFrame({"nr_of_tweets": scored_tweets.text, "short": labels})
    counts = per_label.groupby("short").count()
    counts["sentiment"] = counts.index
    counts.reset_index(drop=True, inplace=True)
    return labels, counts


short_twitter_btc, twitter_btc_grouped = _label_and_count(twitter_btc_df)
short_twitter_eth, twitter_eth_grouped = _label_and_count(twitter_eth_df)
#______________________________________________________________________________
# News preprocessing
#_____________________________________________________________________________________________________
# Stored CCN articles, newest first.
ccn = pd.read_sql(
    sql="select distinct article, date from ccn_articles order by date desc",
    con=connection,
    index_col=None,
)

# Score every article and attach the publication date again via the article text.
ccn_sa = list(map(get_sentiment, ccn.article))
ccn_sa_df = pd.DataFrame({"article": ccn.article, "SA_score": ccn_sa})
ccn_df = pd.merge(ccn_sa_df, ccn, how='inner', on="article")

# Number of articles per sentiment label.
short_ccn = list(map(get_short_sentiment, ccn_df.SA_score))
ccn_labelled = pd.DataFrame({"nr_of_articles": ccn_df.article, "short": short_ccn})
ccn_grouped = ccn_labelled.groupby("short").count()
ccn_grouped["sentiment"] = ccn_grouped.index
ccn_grouped.reset_index(drop=True, inplace=True)
#______________________________________________________________________________
# Simple BoW model
#______________________________________________________________________________
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
n = 10  # default number of words returned by generate_word_list


def generate_word_list(text_col, nr_words = n):
    """Return the most frequent words of a text column as a DataFrame.

    The text is tokenized, lower-cased, reduced to alphabetic tokens, stripped
    of English and domain-specific stop words, and lemmatized.

    Parameters:
        text_col: pandas Series (or DataFrame) holding the texts.
        nr_words: maximum number of words to return.

    Returns:
        DataFrame with the columns "word" and "word_count", ordered from the
        most to the least frequent word. It has fewer than *nr_words* rows
        (possibly none) when the text contains fewer distinct words.
    """
    # Tokenize the actual text rather than the printable rendering produced by
    # to_string(), which mixes in index labels and, depending on the pandas
    # version and display options, abbreviates long values.
    texts = pd.Series(text_col.values.ravel()).dropna().astype(str)
    tokens = word_tokenize(" ".join(texts))

    # A set gives constant-time membership tests; the local name no longer
    # shadows the imported `stopwords` module.
    excluded = set(nltk.corpus.stopwords.words('english'))
    excluded.update(["rt", "bitcoin", "crypto", "cryptocurrency", "blockchain", "blockcha", "btc", "bitcoi", "bitcoins", "daily", "say", "could",
                     "price", "ethereum", "eth", "classic", "exchange", "market", "cryptocurrencie", "one", "first", "short", "check",
                     "cryptocurrencies", "http", "htttp", "hour", "list", "u", "new", "vi", "ccn", "etc", "usd"])

    lowered = (token.lower() for token in tokens)
    candidates = [token for token in lowered
                  if token.isalpha() and len(token) > 1 and token not in excluded]

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = (wordnet_lemmatizer.lemmatize(token) for token in candidates)
    # lemmatization can produce a stop word again, so filter a second time
    bow = Counter(lemma for lemma in lemmatized if lemma not in excluded)

    # most_common() is evaluated once and may return fewer than nr_words
    # entries; indexing it up to nr_words would raise an IndexError then.
    top_words = bow.most_common(nr_words)
    return pd.DataFrame(top_words, columns=["word", "word_count"])
#______________________________________________________________________________
# Aggregate Sentiment by day
#______________________________________________________________________________
def _daily_mean_score(frame, time_column):
    """Return the mean SA_score per calendar day, indexed by *time_column*."""
    return frame.set_index(time_column, inplace=False).SA_score.resample('D').mean()


def _fill_gaps(scores, columns):
    """Linearly interpolate missing days in the given *columns* of *scores*.

    The interpolated column is assigned back explicitly: calling
    interpolate(inplace=True) on a selected column modifies an intermediate
    object and is not guaranteed to reach the parent frame.
    """
    for column in columns:
        scores[column] = scores[column].interpolate(method='linear')
    return scores


# Reddit
minDate = btc_reddit["created_utc"].min()
maxDate = btc_reddit["created_utc"].max()
ts_btc_reddit = _daily_mean_score(btc_reddit, "created_utc")
ts_eth_reddit = _daily_mean_score(eth_reddit, "created_utc")
standardized_reddit_scores = pd.DataFrame({'BTC': ts_btc_reddit, 'ETH': ts_eth_reddit})
# Since the server might be down on certain days, the time series must not
# have discontinuities: gaps of any size are filled with a straight line.
standardized_reddit_scores = _fill_gaps(standardized_reddit_scores, ['BTC', 'ETH'])

# Twitter
ts_twitter_btc_df = _daily_mean_score(twitter_btc_df, "tweet_created")
ts_twitter_eth_df = _daily_mean_score(twitter_eth_df, "tweet_created")
standardized_twitter_scores = pd.DataFrame({'BTC': ts_twitter_btc_df, 'ETH': ts_twitter_eth_df})
standardized_twitter_scores = _fill_gaps(standardized_twitter_scores, ['BTC', 'ETH'])

# News (not interpolated)
ts_ccn = _daily_mean_score(ccn_df, "date")
standardized_ccn_scores = pd.DataFrame({'CCN': ts_ccn})
#______________________________________________________________________________
# Preprocessing for aggregated plot:
#______________________________________________________________________________
from sklearn.preprocessing import MinMaxScaler


def _fetch_recent_prices(history_url):
    """Download a short CryptoCompare daily history, indexed by timestamp.

    The list stored under the "Data" key of the JSON response becomes the
    rows; the unix "time" column is converted to a datetime index named
    "timestamp".
    """
    payload = requests.get(history_url).json()
    prices = pd.DataFrame(payload["Data"])
    prices["timestamp"] = pd.to_datetime(prices["time"], unit='s')
    prices.set_index("timestamp", inplace=True)
    return prices


# Only the last few days are needed for the aggregated plot.
btc_mini = _fetch_recent_prices(
    'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=20')
eth_mini = _fetch_recent_prices(
    'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=20')

# Scale every column to the range [-1, 1], i.e. the same scale as the
# sentiment values, so prices and sentiment can share one axis. The scaler is
# re-fitted for each currency; the scaled frames carry a default integer index.
scaler = MinMaxScaler(feature_range=(-1,1))
btc_scaled = pd.DataFrame(scaler.fit_transform(btc_mini), columns=btc_mini.columns)
eth_scaled = pd.DataFrame(scaler.fit_transform(eth_mini), columns=eth_mini.columns)
#______________________________________________________________________________
### Preprocessing for the map
#______________________________________________________________________________
#get the store name that can support crytocurrency
# Access token passed to Mapbox for rendering the map.
mapbox_access_token = 'pk.eyJ1Ijoic2FyYXB1dHJpIiwiYSI6ImNqaTMzNDBuaTB2djgzdm9hZXlnMTl1cW4ifQ.8vsxIGidl6bUz-u_rK3YSQ'

# Venues listed by coinmap.org; the JSON response stores them as a list under
# the key "venues".
url = 'https://coinmap.org/api/v1/venues/'
r = requests.get(url)
json_data = r.json()
venues_df = pd.DataFrame(json_data["venues"])

# Columns used by the map: coordinates, venue name and venue category.
site_lat, site_lon, locations_name, category_name = (
    venues_df[column] for column in ("lat", "lon", "name", "category")
)
#get the address data from google maps from latitude and longitude
#______________________________________________________________________________
# Preprocessing for the NER = Named Entity Recognition
#______________________________________________________________________________
import pickle


def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE(review): pickle executes code on load - only use files produced by
    # this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed named-entity aggregates for the news, Reddit and Twitter data.
ccn_df_tagged = _unpickle('Pickle_dash/ccn_df_tagged.pkl')
df_agg_count_ccn = _unpickle('Pickle_dash/df_agg_count_ccn.pkl')
df_btc_reddit_org_agg = _unpickle('Pickle_dash/df_btc_reddit_org_agg.pkl')
df_btc_reddit_pep_agg = _unpickle('Pickle_dash/df_btc_reddit_pep_agg.pkl')
df_tweet_agg_sum = _unpickle('Pickle_dash/df_tweet_agg_sum.pkl')
df_tweet_btc_agg_sum = _unpickle('Pickle_dash/df_tweet_btc_agg_sum.pkl')
df_tweet_eth_agg_sum = _unpickle('Pickle_dash/df_tweet_eth_agg_sum.pkl')
df_tweet_pep_agg = _unpickle('Pickle_dash/df_tweet_pep_agg.pkl')
df_tweet_pep_btc_agg = _unpickle('Pickle_dash/df_tweet_pep_btc_agg.pkl')
df_tweet_pep_eth_agg = _unpickle('Pickle_dash/df_tweet_pep_eth_agg.pkl')
df_eth_reddit_org_agg = _unpickle('Pickle_dash/df_eth_reddit_org_agg.pkl')
df_eth_reddit_pep_agg = _unpickle('Pickle_dash/df_eth_reddit_pep_agg.pkl')
#_____________________________________________________________________________________________________
# Define the app layout incl. all plots
#_____________________________________________________________________________________________________
app.layout = html.Div([html.H1('This dashboard shows current trends about Bitcoin and Ethereum in order to help you to make an informed decision for your investment',
id='h1-element'),
html.H3("Is the sentiment in the News and Social Media connected to the price developments over time? Let's have a look!"),
dcc.Graph(id='barplot5',
figure = {'data':[
go.Scatter(
x = btc_mini.index,
y = btc_scaled.close,
name = "BTC in USD (scaled)",
visible=True,
marker=dict(color='#f2a900'),
mode = 'markers+lines'
),
go.Scatter(
x = eth_mini.index,
y = eth_scaled.close,
name = "ETH in USD (scaled)",
visible=True,
marker=dict(color='#4d4d4e'),
mode = 'markers+lines'
),
go.Scatter(
x = standardized_reddit_scores.index,
y = standardized_reddit_scores.BTC,
line = dict(color = '#f2a900', dash = 'dot'),
name = "BTC Sentiment on Reddit",
visible=True,
#marker=dict(color='green'),
mode = 'markers+lines'
),
go.Scatter(
x = standardized_reddit_scores.index,
y = standardized_reddit_scores.ETH,
line = dict(color = '#4d4d4e', dash = 'dot'),
name = "ETH Sentiment on Reddit",
visible=True,
#marker=dict(color='blue'),
mode = 'markers+lines'
),
go.Scatter(
x = standardized_ccn_scores.index,
y = standardized_ccn_scores.CCN,
line = dict(color = 'green', dash = 'dash'),
name = "BTC and ETH Sentiment in the News",
visible=True,
mode = 'markers+lines'
),
go.Scatter(
x = standardized_twitter_scores.index,
y = standardized_twitter_scores.BTC,
line = dict(color = 'blue', dash = 'solid'),
name = "BTC Sentiment on Twitter",
visible=True,
#marker=dict(color='green'),
mode = 'markers+lines'
),
go.Scatter(
x = standardized_twitter_scores.index,
y = standardized_twitter_scores.ETH,
line = dict(color = 'purple', dash = 'solid'),
name = "ETH Sentiment on Twitter",
visible=True,
#marker=dict(color='blue'),
mode = 'markers+lines'
)],
'layout':go.Layout(title = 'BTC and ETH values & sentiment', showlegend=True,
updatemenus = list([
dict(active=-1, buttons=list([
dict(label = 'BTC and ETH Values over time',
method = 'update',
args = [{'visible': [True, True, False, False, False, False, False]},
{'title': 'BTC and ETH values'}]),
dict(label = 'BTC and ETH Sentiment on Reddit',
method = 'update',
args = [{'visible': [False, False, True, True, False, False, False]},
{'title': 'BTC and ETH sentiment on Reddit'}]),
dict(label = 'News',
method = 'update',
args = [{'visible': [False, False, False, False, True, False, False]},
{'title': 'BTC and ETH sentiment in the News'}]),
dict(label = 'Twitter BTC & ETH',
method = 'update',
args = [{'visible': [False, False, False, False, False, True, True]},
{'title': 'BTC and ETH sentiment on Twitter'}]),
dict(label = 'Reset: show all',
method = 'update',
args = [{'visible': [True, True, True, True, True, True, True]},
{'title': 'BTC and ETH values & sentiment in the News and Social Media'}])
])
)
])
,
xaxis = dict(title = 'Time', range = [minDate, maxDate]),
yaxis = dict(title = 'Sentiment & Values over time')
)}),
html.P("In this dashboard, you can analyze the sentiment on social media and in the news regarding the two most popular cryptocurrencies: Bitcoin (BTC) and Ethereum (ETH).\n \
You can choose the source you are interested in by selecting from the dropdown-menu on the left. \
The sentiment score on the Y axis is a value between -1, denoting a strong negative sentiment, and 1, very positive sentiment."),
# Raw sentiment scores over time per source. Only the two Reddit traces are
# shown initially; the dropdown switches between sources. The 'visible'
# masks are positional: Reddit BTC, Reddit ETH, Twitter BTC, Twitter ETH, News.
dcc.Graph(
    id='scatterplot1',
    figure={
        'data': [
            go.Scatter(x=xs, y=ys, name=trace_name, visible=shown,
                       marker={'color': colour}, mode='markers+lines')
            for xs, ys, trace_name, shown, colour in [
                (btc_reddit.created_utc, btc_reddit.SA_score,
                 "BTC Sentiment on Reddit", True, '#f2a900'),
                (eth_reddit.created_utc, eth_reddit.SA_score,
                 "ETH Sentiment on Reddit", True, '#4d4d4e'),
                (twitter_btc_df2.tweet_created, twitter_btc_df2.SA_score,
                 "BTC Sentiment on Twitter", False, '#f2a900'),
                (twitter_eth_df2.tweet_created, twitter_eth_df2.SA_score,
                 "ETH Sentiment on Twitter", False, '#4d4d4e'),
                # News articles are restricted to the [minDate, maxDate] window.
                (ccn_df.date[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
                 ccn_df.SA_score[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
                 "BTC and ETH Sentiment in the News", False, '#4d4d4e'),
            ]
        ],
        'layout': go.Layout(
            title='BTC and ETH sentiment over time',
            showlegend=True,
            updatemenus=[{
                'active': -1,
                'buttons': [
                    {'label': button_label,
                     'method': 'update',
                     'args': [{'visible': shown}, {'title': chart_title}]}
                    for button_label, shown, chart_title in [
                        ('BTC Sentiment on Reddit',
                         [True, False, False, False, False],
                         'BTC sentiment over time on Reddit'),
                        ('ETH Sentiment on Reddit',
                         [False, True, False, False, False],
                         'ETH sentiment over time on Reddit'),
                        ('Both: Sentiment on Reddit',
                         [True, True, False, False, False],
                         'BTC and ETH sentiment over time on Reddit'),
                        ('BTC Sentiment on Twitter',
                         [False, False, True, False, False],
                         'BTC sentiment over time on Twitter'),
                        ('ETH Sentiment on Twitter',
                         [False, False, False, True, False],
                         'ETH sentiment over time on Twitter'),
                        ('Both: Sentiment on Twitter',
                         [False, False, True, True, False],
                         'BTC and ETH sentiment over time on Twitter'),
                        ('BTC & ETH Sentiment in the News',
                         [False, False, False, False, True],
                         'BTC and ETH Sentiment in the News'),
                    ]
                ],
            }],
            xaxis={'title': 'Time'},
            yaxis={'title': 'Sentiment'},
        ),
    },
),
# Share of sentiment categories per source, one pie per source. Only the
# first pie is shown initially; the dropdown swaps in exactly one pie at a
# time (masks are positional, in the order the pies are listed).
dcc.Graph(
    id='pie2',
    figure={
        'data': [
            go.Pie(labels=categories, values=counts, name=trace_name, visible=shown,
                   # slice colours: red, yellow and green
                   marker={'colors': ['#fc586e', '#fffaaa', '#87d686']})
            for categories, counts, trace_name, shown in [
                (btc_grouped.sentiment, btc_grouped.nr_of_tweets,
                 'BTC Sentiment on Reddit', True),
                (eth_grouped.sentiment, eth_grouped.nr_of_tweets,
                 'ETH Sentiment on Reddit', False),
                (twitter_btc_grouped.sentiment, twitter_btc_grouped.nr_of_tweets,
                 'BTC Sentiment on Twitter', False),
                (twitter_eth_grouped.sentiment, twitter_eth_grouped.nr_of_tweets,
                 'ETH Sentiment on Twitter', False),
                (ccn_grouped.sentiment, ccn_grouped.nr_of_articles,
                 'BTC and ETH Sentiment in the News', False),
            ]
        ],
        'layout': go.Layout(
            title='BTC sentiment on Reddit',
            showlegend=True,
            updatemenus=[{
                'active': -1,
                'buttons': [
                    {'label': button_label,
                     'method': 'update',
                     'args': [{'visible': shown}, {'title': chart_title}]}
                    for button_label, shown, chart_title in [
                        ('BTC sentiment on Reddit',
                         [True, False, False, False, False],
                         'BTC sentiment on Reddit'),
                        ('ETH sentiment on Reddit',
                         [False, True, False, False, False],
                         'ETH sentiment on Reddit'),
                        ('BTC sentiment on Twitter',
                         [False, False, True, False, False],
                         'BTC sentiment on Twitter'),
                        ('ETH sentiment on Twitter',
                         [False, False, False, True, False],
                         'ETH sentiment on Twitter'),
                        ('BTC & ETH Sentiment in the News',
                         [False, False, False, False, True],
                         'BTC and ETH Sentiment in the News'),
                    ]
                ],
            }],
        ),
    },
),
html.H2("Where can we actually pay by using Bitcoin and Ethereum? Let's find out."),
# Map of accepting venues: one marker per (site_lat, site_lon) pair, with the
# store name and category as hover text.
dcc.Graph(
    id='map5',
    figure={
        'data': [
            go.Scattermapbox(
                lat=site_lat,
                lon=site_lon,
                mode='markers',
                marker={'size': 9},
                text='store name: ' + locations_name + ', ' + 'category: ' + category_name,
                hoverinfo="text",
            ),
        ],
        'layout': go.Layout(
            autosize=True,
            hovermode='closest',
            mapbox={
                'accesstoken': mapbox_access_token,
                'bearing': 0,
                # initial viewport centre (lat 52.52, lon 13.405) and zoom level
                'center': {'lat': 52.52, 'lon': 13.4050},
                'pitch': 0,
                'zoom': 10,
            },
        ),
    },
),
html.H2("Which organizations and people are currently mentioned in the News and Social Media with respect to Bitcoin and Ethereum?"),
# Horizontal bar chart of the most-mentioned organizations per source.
# Each trace takes the first rows of its aggregate frame (15 for Twitter and
# CCN, 5 for Reddit) and reverses them via sort_index(ascending=False).
# Mask order in the dropdown: Twitter (BTC & ETH), Twitter (BTC),
# Twitter (ETH), Reddit (BTC), Reddit (ETH), CCN news.
dcc.Graph(
    id='organizations',
    figure={
        'data': [
            go.Bar(
                x=frame[count_col].head(top_k).sort_index(ascending=False),
                y=frame['Organization'].head(top_k).sort_index(ascending=False),
                name=trace_name,
                visible=True,
                orientation='h',
                **styling
            )
            for frame, count_col, top_k, trace_name, styling in [
                (df_tweet_agg_sum, 'Number', 15, "Twitter (BTC & ETH)", {}),
                (df_tweet_btc_agg_sum, 'Number', 15, "Twitter (BTC)", {}),
                (df_tweet_eth_agg_sum, 'Number', 15, "Twitter (ETH)", {}),
                (df_btc_reddit_org_agg, 'Number', 5, "Reddit (BTC)",
                 {'marker': {'color': 'rgb(231, 60, 0)'}}),
                (df_eth_reddit_org_agg, 'Number', 5, "Reddit (ETH)",
                 {'marker': {'color': 'rgb(231, 60, 0)'}}),
                # the CCN aggregate stores its counts in 'Sum', not 'Number'
                (df_agg_count_ccn, 'Sum', 15, "CCN news",
                 {'marker': {'color': 'rgb(231, 118, 0)'}}),
            ]
        ],
        'layout': go.Layout(
            title='Organizations on Twitter, Reddit & CCN news',
            showlegend=True,
            updatemenus=[{
                'active': -1,
                'buttons': [
                    {'label': button_label,
                     'method': 'update',
                     'args': [{'visible': shown}, {'title': chart_title}]}
                    for button_label, shown, chart_title in [
                        ('Twitter (BTC & ETH)',
                         [True, False, False, False, False, False],
                         'TOP 15 organizations on Twitter (BTC & ETH)'),
                        ('Twitter (BTC)',
                         [False, True, False, False, False, False],
                         'TOP 15 organizations on Twitter (BTC)'),
                        ('Twitter (ETH)',
                         [False, False, True, False, False, False],
                         'TOP 15 organizations on Twitter (ETH)'),
                        ('Reddit (BTC)',
                         [False, False, False, True, False, False],
                         'TOP 5 organizations on Reddit (BTC)'),
                        ('Reddit (ETH)',
                         [False, False, False, False, True, False],
                         'TOP 5 organizations on Reddit (ETH)'),
                        ('CCN news',
                         [False, False, False, False, False, True],
                         'TOP 15 organizations on CCN news'),
                        # Fix: the reset button used to copy the CCN-only
                        # title; it now restores the chart's initial title,
                        # matching the traces it shows.
                        ('Reset: show all',
                         [True, True, True, True, True, True],
                         'Organizations on Twitter, Reddit & CCN news'),
                    ]
                ],
                # dropdown placement: left of the plot area, above the top edge
                'direction': 'down',
                'pad': {'r': 10, 't': 10},
                'showactive': True,
                'x': 0,
                'xanchor': 'right',
                'y': 1.2,
                'yanchor': 'top',
            }],
            xaxis={'title': 'Number of occurrences'},
            yaxis={'title': ''},
            # wide left margin leaves room for the organization names
            margin={'l': 250, 'r': 20, 't': 70, 'b': 70},
        ),
    },
),
# Horizontal bar chart of the most-mentioned people per source.
# Each trace takes the first rows of its aggregate frame (15 for Twitter,
# 5 for Reddit) and reverses them via sort_index(ascending=False).
# Mask order in the dropdown: Twitter (BTC & ETH), Twitter (BTC),
# Twitter (ETH), Reddit (BTC), Reddit (ETH).
dcc.Graph(
    id='people',
    figure={
        'data': [
            go.Bar(
                x=frame['Number'].head(top_k).sort_index(ascending=False),
                y=frame['Person'].head(top_k).sort_index(ascending=False),
                name=trace_name,
                visible=True,
                orientation='h',
                **styling
            )
            for frame, top_k, trace_name, styling in [
                (df_tweet_pep_agg, 15, "Twitter (BTC & ETH)", {}),
                (df_tweet_pep_btc_agg, 15, "Twitter (BTC)", {}),
                (df_tweet_pep_eth_agg, 15, "Twitter (ETH)", {}),
                (df_btc_reddit_pep_agg, 5, "Reddit (BTC)",
                 {'marker': {'color': 'rgb(231, 60, 0)'}}),
                (df_eth_reddit_pep_agg, 5, "Reddit (ETH)",
                 {'marker': {'color': 'rgb(231, 60, 0)'}}),
            ]
        ],
        'layout': go.Layout(
            title='TOP people discussed on Twitter & Reddit',
            showlegend=True,
            updatemenus=[{
                'active': -1,
                'buttons': [
                    {'label': button_label,
                     'method': 'update',
                     'args': [{'visible': shown}, {'title': chart_title}]}
                    for button_label, shown, chart_title in [
                        ('Twitter (BTC & ETH)',
                         [True, False, False, False, False],
                         'TOP 15 people on Twitter (BTC & ETH)'),
                        ('Twitter (BTC)',
                         [False, True, False, False, False],
                         'TOP 15 people on Twitter (BTC)'),
                        ('Twitter (ETH)',
                         [False, False, True, False, False],
                         'TOP 15 people on Twitter (ETH)'),
                        ('Reddit (BTC)',
                         [False, False, False, True, False],
                         'TOP 5 people on Reddit (BTC)'),
                        ('Reddit (ETH)',
                         [False, False, False, False, True],
                         'TOP 5 people on Reddit (ETH)'),
                        # Fix: the reset title used to say "on CCN news",
                        # but this chart has no CCN trace; restore the
                        # chart's initial title instead.
                        ('Reset: show all',
                         [True, True, True, True, True],
                         'TOP people discussed on Twitter & Reddit'),
                    ]
                ],
                # dropdown placement: left of the plot area, above the top edge
                'direction': 'down',
                'pad': {'r': 10, 't': 10},
                'showactive': True,
                'x': 0,
                'xanchor': 'right',
                'y': 1.2,
                'yanchor': 'top',
            }],
            xaxis={'title': 'Number of occurrences'},
            yaxis={'title': ''},
            # wide left margin leaves room for the person names
            margin={'l': 250, 'r': 20, 't': 70, 'b': 70},
        ),
    },
),
# BTC/ETH values over time
html.H3("You can also look at the recent development in the currency values. If you are interested in a specific time interval, \
you can zoom in by selecting the desired period. If you click at the small house icon, you can reset the axis again."),
# Daily closing prices for BTC and ETH plus one flat "average" line per
# currency (hidden until the matching dropdown entry is chosen).
# Trace order / mask positions: BTC, BTC Average, ETH, ETH Average.
dcc.Graph(
    id='scatterplot3',
    figure={
        'data': [
            trace
            for symbol, prices in [('BTC', btc_values_df), ('ETH', eth_values_df)]
            for trace in (
                go.Scatter(
                    x=prices.timestamp,
                    y=prices.close,
                    name=symbol,
                    mode='markers+lines',
                ),
                go.Scatter(
                    x=prices.timestamp,
                    # constant series at the mean close, same length as the data
                    y=[prices.close.mean()] * len(prices.timestamp),
                    name=symbol + ' Average',
                    visible=False,
                    line={'color': '#33CFA5', 'dash': 'dash'},
                ),
            )
        ],
        'layout': go.Layout(
            title='BTC and ETH values over time',
            showlegend=True,
            updatemenus=[{
                'active': -1,
                'buttons': [
                    {'label': symbol,
                     'method': 'update',
                     'args': [
                         {'visible': shown},
                         {'title': symbol + ' values over time',
                          'annotations': [
                              # Marks the highest close. idxmax() returns an
                              # index label, so it is looked up with .loc
                              # (the original passed it to .iloc, which is
                              # only right for a default 0..n-1 index).
                              {'x': prices.loc[prices.close.idxmax(), 'timestamp'],
                               'y': prices.close.max(),
                               'xref': 'x', 'yref': 'y',
                               'text': 'Max value:<br>' + str(prices.close.max()),
                               'ax': 0, 'ay': -40},
                              # Fix: this label was pinned to the literal
                              # date '2017-09-01', which drifts out of the
                              # rolling price window. Anchor it a quarter of
                              # the way into whatever data is loaded.
                              {'x': prices.timestamp.iloc[len(prices) // 4],
                               'y': prices.close.mean(),
                               'xref': 'x', 'yref': 'y',
                               'text': 'Average value in the displayed time period:<br>'
                                       + str(round(prices.close.mean(), 2)),
                               'ax': 0, 'ay': -40},
                          ]},
                     ]}
                    for symbol, prices, shown in [
                        ('BTC', btc_values_df, [True, True, False, False]),
                        ('ETH', eth_values_df, [False, False, True, True]),
                    ]
                ] + [
                    # Both price curves, no average lines, no annotations.
                    {'label': 'Both',
                     'method': 'update',
                     'args': [{'visible': [True, False, True, False]},
                              {'title': 'BTC and ETH values over time',
                               'annotations': []}]},
                ],
            }],
            xaxis={'title': 'Time'},
            yaxis={'title': 'Value (in USD)'},
        ),
    },
),
# BoW plot
html.H3("Additionally, you can see the most common words that are used in all discussions around Bitcoin and Ethereum on diverse channels. \
You can select the channel and the currency you are interested in from the dropdown menu on the left."),
# Bar chart of the most common words per channel/currency.
# Mask positions in the dropdown: Reddit BTC, Reddit ETH, Twitter BTC,
# Twitter ETH, News. Both Reddit traces are visible initially.
dcc.Graph(
    id='barplot4',
    figure={
        'data': [
            go.Bar(
                x=top_words.word,
                y=top_words.word_count,
                name=trace_name,
                visible=shown,
                marker={'color': colour},  # gold for BTC/news, silver for ETH
            )
            for text_col, trace_name, shown, colour in [
                (btc_reddit.title, 'BTC words on Reddit', True, '#f2a900'),
                (eth_reddit.title, 'ETH words on Reddit', True, '#4d4d4e'),
                (twitter_btc_df.text, 'BTC words on Twitter', False, '#f2a900'),
                (twitter_eth_df.text, 'ETH words on Twitter', False, '#4d4d4e'),
                (ccn_df.article, 'Top words in Cryptocurrency News', False, '#f2a900'),
            ]
            # Fix: compute each word list once and reuse it for both axes.
            # The original called generate_word_list() separately for x and
            # y, tokenising and lemmatising every corpus twice.
            for top_words in [generate_word_list(text_col=text_col)]
        ],
        'layout': go.Layout(
            title=str(n) + ' most common words currently used in Bitcoin/Ethereum discussions',
            showlegend=True,
            updatemenus=[{
                'active': -1,
                'buttons': [
                    {'label': button_label,
                     'method': 'update',
                     'args': [{'visible': shown},
                              {'title': str(n) + ' most common words currently used about ' + topic}]}
                    for button_label, shown, topic in [
                        ('BTC words on Reddit',
                         [True, False, False, False, False],
                         'Bitcoin on Reddit'),
                        ('ETH words on Reddit',
                         [False, True, False, False, False],
                         'Ethereum on Reddit'),
                        ('Both Reddit',
                         [True, True, False, False, False],
                         'Bitcoin and Ethereum on Reddit'),
                        ('BTC words on Twitter',
                         [False, False, True, False, False],
                         'Bitcoin on Twitter'),
                        ('ETH words on Twitter',
                         [False, False, False, True, False],
                         'Ethereum on Twitter'),
                        ('Both Twitter',
                         [False, False, True, True, False],
                         'Bitcoin and Ethereum on Twitter'),
                        ('Cryptocurrency News',
                         [False, False, False, False, True],
                         'Bitcoin and Ethereum in the News'),
                    ]
                ],
            }],
            xaxis={'title': 'Word'},
            yaxis={'title': 'Word count'},
        ),
    },
),
# Interactive tables to inspect raw data
html.Div([
html.H2('Let\'s go more into detail: Reddit data'),
dt.DataTable(
rows = reddit.to_dict('records'),
filterable=True,
sortable=True
)]),
html.Div([
html.H2('Twitter data'),
dt.DataTable(
rows = twitter.to_dict('records'),
filterable=True,
sortable=True
)]),
html.Div([
html.H2('The News'),
dt.DataTable(
rows = ccn_df.to_dict('records'),
filterable=True,
sortable=True
)])
])
# Start the Dash server only when this file is executed as a script.
if __name__ == "__main__":
    app.run_server()
import re
import numpy as np
import pandas as pd
from textblob import TextBlob
from sqlalchemy import create_engine
from sqlalchemy.types import String, Integer, Float, Boolean, DateTime
import requests
import json
# Load the Reddit posts for both currencies and derive everything the plots
# need from them: the combined table, the plot date range and the
# per-category sentiment counts.
# NOTE(review): the connection URL embeds the database password in source;
# consider reading it from the environment or an untracked config file.
engine = create_engine('postgresql://consultant:pgHWR2018@zeno.lehre.hwr-berlin.de:5432/disruptive')
connection = engine.connect()

btc_reddit = pd.read_sql(
    sql='select distinct title, created_utc, "SA_score_grouped", '
        '"SA_score" from btc_reddit order by created_utc desc',
    con=connection, index_col=None)
eth_reddit = pd.read_sql(
    sql='select distinct title, created_utc, "SA_score_grouped", '
        '"SA_score" from eth_reddit order by created_utc desc',
    con=connection, index_col=None)
btc_reddit.info()
eth_reddit.info()

# Combined BTC + ETH table, shown later in the filterable data table.
# Fix: the original also passed join_axes=None. That keyword was removed in
# pandas 1.0, so the call raised TypeError there. All other arguments it
# spelled out were defaults, so this call is equivalent on older versions.
reddit = pd.concat([btc_reddit, eth_reddit], axis=0, join='outer', ignore_index=False)
reddit.info()
reddit.head()
reddit["SA_score"] = reddit["SA_score"].round(2)
reddit.head()

# Date window used later as the x-axis range / filter in the plots.
minDate = btc_reddit["created_utc"].min()
maxDate = btc_reddit["created_utc"].max()


def _count_by_sentiment(posts):
    """Count rows per SA_score_grouped category.

    Returns a frame with the columns ``nr_of_tweets`` (number of non-null
    SA_score values in the category) and ``sentiment`` (the category), on a
    fresh 0..k-1 index.
    """
    counts = posts[["SA_score", "SA_score_grouped"]].groupby("SA_score_grouped").count()
    counts["sentiment"] = counts.index
    counts.reset_index(drop=True, inplace=True)
    counts.rename(columns={"SA_score": "nr_of_tweets"}, inplace=True)
    return counts


btc_grouped = _count_by_sentiment(btc_reddit)
btc_grouped
eth_grouped = _count_by_sentiment(eth_reddit)
eth_grouped
# Download daily price history for BTC and ETH from the CryptoCompare API
# (the request asks for limit=365 data points per currency).
# Fix: the requests now carry a timeout, since requests.get() can otherwise
# block forever on a stalled connection. HTTP errors are raised explicitly
# instead of surfacing later as a confusing KeyError on the payload.
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=365'
r = requests.get(url, timeout=30)
r.raise_for_status()
json_data = r.json()  # decode the JSON body into a dictionary
btc_values_df = pd.DataFrame(json_data["Data"])  # the price records are stored as a list under the key "Data"
# "time" holds Unix epoch seconds; convert it to pandas datetimes
btc_values_df["timestamp"] = pd.to_datetime(btc_values_df["time"], unit='s')
btc_values_df.tail()

url = 'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=365'
r = requests.get(url, timeout=30)
r.raise_for_status()
json_data = r.json()  # decode the JSON body into a dictionary
eth_values_df = pd.DataFrame(json_data["Data"])
# "time" holds Unix epoch seconds; convert it to pandas datetimes
eth_values_df["timestamp"] = pd.to_datetime(eth_values_df["time"], unit='s')
eth_values_df.tail()
# Pull a ~1% sample of tweets that mention either currency, then split the
# tweet texts into an ETH list and a BTC list by keyword. A tweet matching
# both patterns ends up in both lists.
twitter_df = pd.read_sql(
    sql="select distinct text, tweet_created from twitter3 TABLESAMPLE SYSTEM(1) "
        "where text ~* '(btc|#eth|ether|bitcoin|ethereum)' order by tweet_created desc",
    con=connection,
    index_col=None,
)
twitter_df.info()
twitter_df.head(3)
twitter_df.tail(3)
list_of_tweets = twitter_df.text.tolist()
# NOTE(review): the patterns are unanchored substring matches, so "eth" also
# hits words such as "something" or "method" - confirm this is intended.
eth_tweets = [
    tweet for tweet in list_of_tweets
    if re.search(r"(ethereum|Ethereum|ETH|ETC|Ethereum Classic|EthereumClassic|ether|eth)", tweet) is not None
]
btc_tweets = [
    tweet for tweet in list_of_tweets
    if re.search(r"(bitcoin|Bitcoin|BTC|BitCoin|bitCoin|BitcoinClassic|Bitcoin Classic|bitcoinclassic|bitcoinClassic|XBT)", tweet) is not None
]
# Sentiment analysis
def get_sentiment(sentence):
    '''Return the TextBlob polarity of ``sentence``.

    Values above 0 indicate positive sentiment, values below 0 negative.
    '''
    return TextBlob(sentence).sentiment.polarity
# Score every BTC / ETH tweet and attach the creation time by joining back
# to twitter_df on the tweet text.
btc_twitter_sa = [get_sentiment(sentence) for sentence in btc_tweets]
twitter_btc_df = pd.DataFrame({"text": btc_tweets, "SA_score": btc_twitter_sa})
twitter_btc_df = pd.merge(twitter_btc_df, twitter_df, how='inner', on="text")
eth_twitter_sa = [get_sentiment(sentence) for sentence in eth_tweets]
twitter_eth_df = pd.DataFrame({"text": eth_tweets, "SA_score": eth_twitter_sa})
twitter_eth_df = pd.merge(twitter_eth_df, twitter_df, how="inner", on="text")
twitter_btc_df.head(3)
twitter_btc_df.info()
twitter_eth_df.head(3)
twitter_eth_df.info()

# Combined BTC + ETH table, shown later in the filterable data table.
# Fix: the original also passed join_axes=None. That keyword was removed in
# pandas 1.0, so the call raised TypeError there. All other arguments it
# spelled out were defaults, so this call is equivalent on older versions.
twitter = pd.concat([twitter_btc_df, twitter_eth_df], axis=0, join='outer', ignore_index=False)

# The join on "text" repeats a tweet text once per matching row in
# twitter_df; keep the first row per text for the time-series plots.
twitter_btc_df2 = twitter_btc_df.drop_duplicates(subset=["text"], keep='first', inplace=False)
twitter_btc_df2.info()
twitter_eth_df2 = twitter_eth_df.drop_duplicates(subset=["text"], keep='first', inplace=False)
twitter_eth_df2.info()
def get_short_sentiment(sentence):
    '''Classify a numeric sentiment score as 'positive', 'neutral' or 'negative'.

    Despite its name, ``sentence`` is the score itself. Scores above 0.05
    are 'positive', scores in (-0.005, 0.05] are 'neutral' and everything
    else is 'negative'.
    '''
    if sentence > 0.05:
        return 'positive'
    if sentence > -0.005:
        return 'neutral'
    return 'negative'
# Count tweets per sentiment category for each currency. The result has the
# columns "nr_of_tweets" (count) and "sentiment" (category) on a fresh index.
short_twitter_btc = list(map(get_short_sentiment, twitter_btc_df.SA_score))
short_twitter_eth = list(map(get_short_sentiment, twitter_eth_df.SA_score))

twitter_btc_grouped = pd.DataFrame(
    {"nr_of_tweets": twitter_btc_df.text, "short": short_twitter_btc}
).groupby("short").count()
twitter_eth_grouped = pd.DataFrame(
    {"nr_of_tweets": twitter_eth_df.text, "short": short_twitter_eth}
).groupby("short").count()

# Keep the category as a regular column, then drop it from the index.
twitter_btc_grouped["sentiment"] = twitter_btc_grouped.index
twitter_btc_grouped = twitter_btc_grouped.reset_index(drop=True)
twitter_eth_grouped["sentiment"] = twitter_eth_grouped.index
twitter_eth_grouped = twitter_eth_grouped.reset_index(drop=True)
twitter_btc_grouped
twitter_eth_grouped
# Load the CCN news articles, score each article's sentiment and count the
# articles per sentiment category.
ccn = pd.read_sql(
    sql="select distinct article, date from ccn_articles order by date desc",
    con=connection,
    index_col=None,
)
ccn.info()

ccn_sa = list(map(get_sentiment, ccn.article))
ccn_sa_df = pd.DataFrame({"article": ccn.article, "SA_score": ccn_sa})
# Join the scores back to the articles to recover the publication date.
ccn_df = ccn_sa_df.merge(ccn, how='inner', on="article")
ccn_df.head(1)

short_ccn = list(map(get_short_sentiment, ccn_df.SA_score))
ccn_grouped = pd.DataFrame(
    {"nr_of_articles": ccn_df.article, "short": short_ccn}
).groupby("short").count()
# Keep the category as a regular column, then drop it from the index.
ccn_grouped["sentiment"] = ccn_grouped.index
ccn_grouped = ccn_grouped.reset_index(drop=True)
ccn_grouped
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
n = 10
def generate_word_list(text_col, nr_words = n):
    '''Return the most frequent lemmatized words of a text column.

    The column is rendered to one string with ``to_string()``, tokenized and
    lower-cased. Only purely alphabetic tokens longer than one character are
    kept, so numeric tokens such as index labels are dropped. English and
    domain-specific stop words are removed before lemmatizing.

    Parameters:
        text_col: pandas Series of texts.
        nr_words: maximum number of words to return (defaults to the value
            of the module-level ``n`` at definition time).

    Returns:
        DataFrame with the columns ``word`` and ``word_count``, most
        frequent first. It has fewer than ``nr_words`` rows when the text
        contains fewer distinct words; the original raised IndexError there.
    '''
    tokens = word_tokenize(text_col.to_string())

    # A set gives O(1) membership tests; the original scanned a list per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stop_words.update([
        "rt", "bitcoin", "crypto", "cryptocurrency", "blockchain", "btc", "bitcoi", "bitcoins",
        "price", "ethereum", "eth", "classic", "exchange", "market", "cryptocurrencie",
        "cryptocurrencies", "http", "htttp", "hour", "list", "u",
    ])

    lemmatizer = WordNetLemmatizer()
    lemmatized = [
        lemmatizer.lemmatize(token)
        for token in (raw.lower() for raw in tokens)
        if token.isalpha() and token not in stop_words and len(token) > 1
    ]

    # Rank once. The original re-ran most_common() twice per output row and
    # indexed past the end when fewer than nr_words distinct words existed.
    top = Counter(lemmatized).most_common(nr_words)
    words_and_counts_df = pd.DataFrame({
        "word": [token for token, _ in top],
        "word_count": [count for _, count in top],
    })
    return words_and_counts_df
# Preview calls for each text source. The returned frames are not assigned,
# so outside an interactive notebook cell these lines have no visible effect.
generate_word_list(text_col = btc_reddit.title)
generate_word_list(text_col = eth_reddit.title)
generate_word_list(text_col = twitter_btc_df.text)
generate_word_list(text_col = twitter_eth_df.text)
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Prepare plotly for rendering figures inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from the CDN
# instead of embedding it in the notebook - confirm against the plotly docs.
init_notebook_mode(connected=True)
# Put cufflinks (the pandas <-> plotly bridge) into offline mode.
cf.go_offline()
# IPython magic: only valid inside a notebook / IPython session.
%matplotlib inline
# Notebook preview of the sentiment-over-time chart. Only the two Reddit
# traces are shown initially; the dropdown switches between sources. The
# 'visible' masks are positional: Reddit BTC, Reddit ETH, Twitter BTC,
# Twitter ETH, News.
figure = {
    'data': [
        go.Scatter(x=xs, y=ys, name=trace_name, visible=shown,
                   marker={'color': colour}, mode='markers+lines')
        for xs, ys, trace_name, shown, colour in [
            (btc_reddit.created_utc, btc_reddit.SA_score,
             "BTC Sentiment on Reddit", True, '#f2a900'),
            (eth_reddit.created_utc, eth_reddit.SA_score,
             "ETH Sentiment on Reddit", True, '#4d4d4e'),
            (twitter_btc_df2.tweet_created, twitter_btc_df2.SA_score,
             "BTC Sentiment on Twitter", False, '#f2a900'),
            (twitter_eth_df2.tweet_created, twitter_eth_df2.SA_score,
             "ETH Sentiment on Twitter", False, '#4d4d4e'),
            # News articles are restricted to the [minDate, maxDate] window.
            (ccn_df.date[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
             ccn_df.SA_score[np.logical_and(ccn_df.date >= minDate, ccn_df.date <= maxDate)],
             "BTC and ETH Sentiment in the News", False, '#4d4d4e'),
        ]
    ],
    'layout': go.Layout(
        title='BTC and ETH sentiment over time',
        showlegend=True,
        updatemenus=[{
            'active': -1,
            'buttons': [
                {'label': button_label,
                 'method': 'update',
                 'args': [{'visible': shown}, {'title': chart_title}]}
                for button_label, shown, chart_title in [
                    ('BTC Sentiment on Reddit',
                     [True, False, False, False, False],
                     'BTC sentiment over time on Reddit'),
                    ('ETH Sentiment on Reddit',
                     [False, True, False, False, False],
                     'ETH sentiment over time on Reddit'),
                    ('Both: Sentiment on Reddit',
                     [True, True, False, False, False],
                     'BTC and ETH sentiment over time on Reddit'),
                    ('BTC Sentiment on Twitter',
                     [False, False, True, False, False],
                     'BTC sentiment over time on Twitter'),
                    ('ETH Sentiment on Twitter',
                     [False, False, False, True, False],
                     'ETH sentiment over time on Twitter'),
                    ('Both: Sentiment on Twitter',
                     [False, False, True, True, False],
                     'BTC and ETH sentiment over time on Twitter'),
                    ('BTC & ETH Sentiment in the News',
                     [False, False, False, False, True],
                     'BTC and ETH Sentiment in the News'),
                ]
            ],
        }],
        xaxis={'title': 'Time'},
        yaxis={'title': 'Sentiment'},
    ),
}
iplot(go.Figure(figure))
# Notebook preview of the sentiment-share pies, one pie per source. Only the
# first pie is shown initially; the dropdown swaps in exactly one pie at a
# time (masks are positional, in the order the pies are listed).
figure = {
    'data': [
        go.Pie(labels=categories, values=counts, name=trace_name, visible=shown,
               # slice colours: red, yellow and green
               marker={'colors': ['#fc586e', '#fffaaa', '#87d686']})
        for categories, counts, trace_name, shown in [
            (btc_grouped.sentiment, btc_grouped.nr_of_tweets,
             'BTC Sentiment on Reddit', True),
            (eth_grouped.sentiment, eth_grouped.nr_of_tweets,
             'ETH Sentiment on Reddit', False),
            (twitter_btc_grouped.sentiment, twitter_btc_grouped.nr_of_tweets,
             'BTC Sentiment on Twitter', False),
            (twitter_eth_grouped.sentiment, twitter_eth_grouped.nr_of_tweets,
             'ETH Sentiment on Twitter', False),
            (ccn_grouped.sentiment, ccn_grouped.nr_of_articles,
             'BTC and ETH Sentiment in the News', False),
        ]
    ],
    'layout': go.Layout(
        title='BTC sentiment on Reddit',
        showlegend=True,
        updatemenus=[{
            'active': -1,
            'buttons': [
                {'label': button_label,
                 'method': 'update',
                 'args': [{'visible': shown}, {'title': chart_title}]}
                for button_label, shown, chart_title in [
                    ('BTC sentiment on Reddit',
                     [True, False, False, False, False],
                     'BTC sentiment on Reddit'),
                    ('ETH sentiment on Reddit',
                     [False, True, False, False, False],
                     'ETH sentiment on Reddit'),
                    ('BTC sentiment on Twitter',
                     [False, False, True, False, False],
                     'BTC sentiment on Twitter'),
                    ('ETH sentiment on Twitter',
                     [False, False, False, True, False],
                     'ETH sentiment on Twitter'),
                    ('BTC & ETH Sentiment in the News',
                     [False, False, False, False, True],
                     'BTC and ETH Sentiment in the News'),
                ]
            ],
        }],
    ),
}
iplot(go.Figure(figure))
# Notebook preview of the price chart: daily closing prices for BTC and ETH
# plus one flat "average" line per currency (hidden until the matching
# dropdown entry is chosen).
# Trace order / mask positions: BTC, BTC Average, ETH, ETH Average.
figure = {
    'data': [
        trace
        for symbol, prices in [('BTC', btc_values_df), ('ETH', eth_values_df)]
        for trace in (
            go.Scatter(
                x=prices.timestamp,
                y=prices.close,
                name=symbol,
                mode='markers+lines',
            ),
            go.Scatter(
                x=prices.timestamp,
                # constant series at the mean close, same length as the data
                y=[prices.close.mean()] * len(prices.timestamp),
                name=symbol + ' Average',
                visible=False,
                line={'color': '#33CFA5', 'dash': 'dash'},
            ),
        )
    ],
    'layout': go.Layout(
        title='BTC and ETH values over time',
        showlegend=True,
        updatemenus=[{
            'active': -1,
            'buttons': [
                {'label': symbol,
                 'method': 'update',
                 'args': [
                     {'visible': shown},
                     {'title': symbol + ' values over time',
                      'annotations': [
                          # Marks the highest close. idxmax() returns an
                          # index label, so it is looked up with .loc (the
                          # original passed it to .iloc, which is only right
                          # for a default 0..n-1 index).
                          {'x': prices.loc[prices.close.idxmax(), 'timestamp'],
                           'y': prices.close.max(),
                           'xref': 'x', 'yref': 'y',
                           'text': 'Max value:<br>' + str(prices.close.max()),
                           'ax': 0, 'ay': -40},
                          # Fix: this label was pinned to the literal date
                          # '2017-09-01', which drifts out of the rolling
                          # price window. Anchor it a quarter of the way
                          # into whatever data is loaded.
                          {'x': prices.timestamp.iloc[len(prices) // 4],
                           'y': prices.close.mean(),
                           'xref': 'x', 'yref': 'y',
                           'text': 'Average value in the displayed time period:<br>'
                                   + str(round(prices.close.mean(), 2)),
                           'ax': 0, 'ay': -40},
                      ]},
                 ]}
                for symbol, prices, shown in [
                    ('BTC', btc_values_df, [True, True, False, False]),
                    ('ETH', eth_values_df, [False, False, True, True]),
                ]
            ] + [
                # Both price curves, no average lines, no annotations.
                {'label': 'Both',
                 'method': 'update',
                 'args': [{'visible': [True, False, True, False]},
                          {'title': 'BTC and ETH values over time',
                           'annotations': []}]},
            ],
        }],
        xaxis={'title': 'Time'},
        yaxis={'title': 'Value (in USD)'},
    ),
}
iplot(go.Figure(figure))
# Bar chart of the most common words per source (Reddit, Twitter, news) with a
# dropdown to switch between the sources.
# Trace order (the 'visible' masks of the buttons depend on it):
#   0 = BTC Reddit, 1 = ETH Reddit, 2 = BTC Twitter, 3 = ETH Twitter, 4 = news
# NOTE(review): `n` comes from earlier in the file; presumably it is the number
# of top words generate_word_list() returns - confirm.

# Build every word list once; each one is needed for both the x (word) and the
# y (word_count) axis, so calling generate_word_list() inline did the work twice.
btc_reddit_words = generate_word_list(text_col=btc_reddit.title)
eth_reddit_words = generate_word_list(text_col=eth_reddit.title)
btc_twitter_words = generate_word_list(text_col=twitter_btc_df.text)
eth_twitter_words = generate_word_list(text_col=twitter_eth_df.text)
ccn_words = generate_word_list(text_col=ccn_df.article)

figure = {'data':[
    go.Bar(
        x=btc_reddit_words.word,
        y=btc_reddit_words.word_count,
        name = 'BTC words on Reddit',
        visible=True,
        marker=dict(color='#f2a900') # set the marker color to gold
    ),
    go.Bar(
        x=eth_reddit_words.word,
        y=eth_reddit_words.word_count,
        name = 'ETH words on Reddit',
        visible=True,
        marker=dict(color='#4d4d4e') # set the marker color to silver
    ),
    go.Bar(
        x=btc_twitter_words.word,
        y=btc_twitter_words.word_count,
        name = 'BTC words on Twitter',
        visible=False,
        marker=dict(color='#f2a900') # set the marker color to gold
    ),
    go.Bar(
        x=eth_twitter_words.word,
        y=eth_twitter_words.word_count,
        name = 'ETH words on Twitter',
        visible=False,
        marker=dict(color='#4d4d4e') # set the marker color to silver
    ),
    go.Bar(
        x=ccn_words.word,
        y=ccn_words.word_count,
        name = 'Top words in Cryptocurrency News',
        visible=False,
        marker=dict(color='#f2a900') # set the marker color to gold
    )
],
'layout':go.Layout(title = str(n) +' most common words currently used in Bitcoin/Ethereum discussions', showlegend=True,
    updatemenus = [
        dict(active=-1,
            buttons=[
                dict(label = 'BTC words on Reddit',
                    method = 'update',
                    args = [{'visible': [True, False, False, False, False]},
                            {'title': str(n) + ' most common words currently used about Bitcoin on Reddit'}]),
                dict(label = 'ETH words on Reddit',
                    method = 'update',
                    args = [{'visible': [False, True, False, False, False]},
                            {'title': str(n) + ' most common words currently used about Ethereum on Reddit'}]),
                dict(label = 'Both Reddit',
                    method = 'update',
                    args = [{'visible': [True, True, False, False, False]},
                            {'title': str(n)+ ' most common words currently used about Bitcoin and Ethereum on Reddit'}]),
                dict(label = 'BTC words on Twitter',
                    method = 'update',
                    args = [{'visible': [False, False, True, False, False]},
                            {'title': str(n) + ' most common words currently used about Bitcoin on Twitter'}]),
                dict(label = 'ETH words on Twitter',
                    method = 'update',
                    args = [{'visible': [False, False, False, True, False]},
                            {'title': str(n) + ' most common words currently used about Ethereum on Twitter'}]),
                dict(label = 'Both Twitter',
                    method = 'update',
                    args = [{'visible': [False, False, True, True, False]},
                            {'title': str(n) + ' most common words currently used about Bitcoin and Ethereum on Twitter'}]),
                dict(label = 'Cryptocurrency News',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, True]},
                            {'title': str(n) + ' most common words currently used about Bitcoin and Ethereum in the News'}])
            ],
        )
    ],
    xaxis = {'title':'Word'},
    yaxis = {'title':'Word count'}
)}
iplot(go.Figure(figure))
# Notebook previews of the Reddit data.
btc_reddit.head(2)
eth_reddit.head(2)
# Reddit: average the sentiment score (SA_score) per calendar day.
# resample('D') needs a datetime index, hence the set_index on "created_utc".
ts_btc_reddit = btc_reddit.set_index("created_utc", inplace=False)
ts_btc_reddit = ts_btc_reddit.SA_score.resample('D').mean()
ts_eth_reddit = eth_reddit.set_index("created_utc", inplace=False)
ts_eth_reddit = ts_eth_reddit.SA_score.resample('D').mean()
# One column per currency, aligned on the daily index.
standardized_reddit_scores = pd.DataFrame({'BTC':ts_btc_reddit,'ETH':ts_eth_reddit})
# Since the server might be down on certain days, we need to ensure that our
# time series has no discontinuities: interpolate() bridges missing days
# linearly.
# The result is assigned back to the column instead of calling
# interpolate(inplace=True) on the column selection: that chained in-place
# form operates on a temporary under pandas Copy-on-Write and would leave the
# DataFrame unchanged.
standardized_reddit_scores['BTC'] = standardized_reddit_scores['BTC'].interpolate(method='linear')
standardized_reddit_scores['ETH'] = standardized_reddit_scores['ETH'].interpolate(method='linear')
standardized_reddit_scores
# Notebook previews of the Twitter data.
twitter_btc_df.head(2)
twitter_eth_df.head(2)
# Twitter: average the sentiment score (SA_score) per calendar day.
# resample('D') needs a datetime index, hence the set_index on "tweet_created".
ts_twitter_btc_df = twitter_btc_df.set_index("tweet_created", inplace=False)
ts_twitter_btc_df = ts_twitter_btc_df.SA_score.resample('D').mean()
ts_twitter_eth_df = twitter_eth_df.set_index("tweet_created", inplace=False)
ts_twitter_eth_df = ts_twitter_eth_df.SA_score.resample('D').mean()
# One column per currency, aligned on the daily index.
standardized_twitter_scores = pd.DataFrame({'BTC':ts_twitter_btc_df, 'ETH':ts_twitter_eth_df})
# Since the server might be down on certain days, we need to ensure that the
# time series has no discontinuities: interpolate() bridges missing days
# linearly.
# The result is assigned back to the column instead of calling
# interpolate(inplace=True) on the column selection: that chained in-place
# form operates on a temporary under pandas Copy-on-Write and would leave the
# DataFrame unchanged.
standardized_twitter_scores['BTC'] = standardized_twitter_scores['BTC'].interpolate(method='linear')
standardized_twitter_scores['ETH'] = standardized_twitter_scores['ETH'].interpolate(method='linear')
standardized_twitter_scores
# News (CCN): drop everything dated before 10 June 2018, then average the
# sentiment score (SA_score) per calendar day.
ccn_df = ccn_df.loc[ccn_df.date >= pd.to_datetime("2018-06-10")]
# Notebook previews of the first and last remaining article.
ccn_df.head(1)
ccn_df.tail(1)
# resample('D') needs a datetime index, hence the set_index on "date".
ts_ccn = ccn_df.set_index("date")["SA_score"].resample('D').mean()
# Single-column frame so it has the same shape as the Reddit/Twitter tables.
standardized_ccn_scores = ts_ccn.to_frame(name='CCN')
standardized_ccn_scores
# Short daily price history for BTC and ETH from the CryptoCompare API. These
# "mini" tables are later plotted next to the sentiment time series.
len(standardized_reddit_scores)

# --- BTC ---
# The query asks for `limit=20` daily entries. Commented-out alternatives that
# were tried: sizing the limit by len(standardized_reddit_scores), and pinning
# the end date with &toTs=1522224000.
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=20' # + str(len(standardized_reddit_scores))
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail with a clear HTTP error instead of an obscure JSON/KeyError further down.
r.raise_for_status()
json_data = r.json() # decode the JSON response into a dictionary
btc_mini = pd.DataFrame(json_data["Data"]) # the price records are stored as a list under the key "Data"
btc_mini["timestamp"] = pd.to_datetime(btc_mini["time"], unit='s') # "time" holds epoch seconds
btc_mini

# --- ETH --- (same request, different symbol)
url = 'https://min-api.cryptocompare.com/data/histoday?fsym=ETH&tsym=USD&limit=20'
r = requests.get(url, timeout=30)
r.raise_for_status()
json_data = r.json() # decode the JSON response into a dictionary
eth_mini = pd.DataFrame(json_data["Data"]) # the price records are stored as a list under the key "Data"
eth_mini["timestamp"] = pd.to_datetime(eth_mini["time"], unit='s') # "time" holds epoch seconds
eth_mini

# Index both tables by timestamp so they can share a time axis with the
# sentiment series.
btc_mini.set_index("timestamp", inplace=True)
btc_mini.head(2)
eth_mini.set_index("timestamp", inplace=True)
eth_mini.head(2)
# Rescale every column of the "mini" BTC/ETH tables into the interval [-1, 1],
# the same range as the sentiment values, so that prices and sentiment can
# share one y axis in the plot.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
# The returned array carries no labels, so the column names are re-attached;
# the rows get a fresh default index.
btc_scaled = pd.DataFrame(data=scaler.fit(btc_mini).transform(btc_mini),
                          columns=btc_mini.columns)
btc_scaled
# The same scaler object is re-fitted on the ETH table.
eth_scaled = pd.DataFrame(data=scaler.fit(eth_mini).transform(eth_mini),
                          columns=eth_mini.columns)
eth_scaled
# Combined chart: scaled BTC/ETH prices next to the daily sentiment from
# Reddit, the news and Twitter, with a dropdown to isolate each group.
# Trace positions (used by the dropdown masks):
#   0 = BTC price, 1 = ETH price, 2 = BTC Reddit, 3 = ETH Reddit,
#   4 = news, 5 = BTC Twitter, 6 = ETH Twitter


def _visibility_mask(*shown):
    """Return a 7-element list that is True only at the given trace positions."""
    return [position in shown for position in range(7)]


def _menu_entry(label, shown, title):
    """Build one dropdown button that shows the traces in *shown* and sets *title*."""
    return dict(label=label,
                method='update',
                args=[{'visible': _visibility_mask(*shown)}, {'title': title}])


sentiment_traces = [
    go.Scatter(name="BTC in USD (scaled)",
               x=btc_mini.index, y=btc_scaled.close,
               mode='markers+lines', visible=True,
               marker=dict(color='#f2a900')),
    go.Scatter(name="ETH in USD (scaled)",
               x=eth_mini.index, y=eth_scaled.close,
               mode='markers+lines', visible=True,
               marker=dict(color='#4d4d4e')),
    go.Scatter(name="BTC Sentiment on Reddit",
               x=standardized_reddit_scores.index, y=standardized_reddit_scores.BTC,
               mode='markers+lines', visible=True,
               line=dict(color='#f2a900', dash='dot')),
    go.Scatter(name="ETH Sentiment on Reddit",
               x=standardized_reddit_scores.index, y=standardized_reddit_scores.ETH,
               mode='markers+lines', visible=True,
               line=dict(color='#4d4d4e', dash='dot')),
    go.Scatter(name="BTC and ETH Sentiment in the News",
               x=standardized_ccn_scores.index, y=standardized_ccn_scores.CCN,
               mode='markers+lines', visible=True,
               line=dict(color='green', dash='dash')),
    go.Scatter(name="BTC Sentiment on Twitter",
               x=standardized_twitter_scores.index, y=standardized_twitter_scores.BTC,
               mode='markers+lines', visible=True,
               line=dict(color='blue', dash='solid')),
    go.Scatter(name="ETH Sentiment on Twitter",
               x=standardized_twitter_scores.index, y=standardized_twitter_scores.ETH,
               mode='markers+lines', visible=True,
               line=dict(color='purple', dash='solid')),
]

sentiment_menu = dict(active=-1, buttons=[
    _menu_entry('BTC and ETH Values over time', (0, 1),
                'BTC and ETH values'),
    _menu_entry('BTC and ETH Sentiment on Reddit', (2, 3),
                'BTC and ETH sentiment on Reddit'),
    _menu_entry('News', (4,),
                'BTC and ETH sentiment in the News'),
    _menu_entry('Twitter BTC & ETH', (5, 6),
                'BTC and ETH sentiment on Twitter'),
    _menu_entry('Reset: show all', (0, 1, 2, 3, 4, 5, 6),
                'BTC and ETH values & sentiment in the News and Social Media'),
])

figure = {
    'data': sentiment_traces,
    'layout': go.Layout(
        title='BTC and ETH values & sentiment',
        showlegend=True,
        updatemenus=[sentiment_menu],
        # minDate / maxDate are defined earlier in the file and bound the x axis.
        xaxis=dict(title='Time', range=[minDate, maxDate]),
        yaxis=dict(title='Sentiment & Values over time'),
    ),
}
iplot(go.Figure(figure))
# Plotly setup for the map section below.
import plotly
# Disabled: storing plotly cloud credentials (only relevant for online plotting).
#plotly.tools.set_credentials_file(username='saraputri', api_key='ryjnBkAlTcxF1bPqaNde')
import plotly.tools as tls
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode so that iplot() can render figures
# inline in the notebook.
plotly.offline.init_notebook_mode(connected=True)
#add atm
#mining pools
#multiple marker
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import json
import requests
import pandas as pd
import cufflinks as cf
import pandas as pd
# Data for the combined map: cryptocurrency ATMs, venues accepting
# cryptocurrency, and mining pools.

# Mapbox access token used by the map layout.
mapbox_access_token = 'pk.eyJ1Ijoic2FyYXB1dHJpIiwiYSI6ImNqaTMzNDBuaTB2djgzdm9hZXlnMTl1cW4ifQ.8vsxIGidl6bUz-u_rK3YSQ'

# ATM locations from the Coinatmfinder API.
url_btc_atm = 'https://www.coinatmfinder.com/CoimATMs-API.php'
# A timeout keeps the script from hanging forever on an unresponsive server.
r_btc_atm = requests.get(url_btc_atm, timeout=30)
# Fail with a clear HTTP error instead of an obscure JSON/KeyError further down.
r_btc_atm.raise_for_status()
json_data_btc_atm = r_btc_atm.json() # decode the JSON response
btc_atm_df = pd.DataFrame(json_data_btc_atm) # the whole response body is turned into the ATM table
site_lat_btc_atm = btc_atm_df["lat"]
site_lon_btc_atm = btc_atm_df["lng"]
locations_name_btc_atm = btc_atm_df["location"]
buy = btc_atm_df["buy"]
sell = btc_atm_df["sell"]
address = btc_atm_df["address"]
currency = btc_atm_df["currency"]

# Venues that accept cryptocurrency, from the Coinmap API.
url_venues = 'https://coinmap.org/api/v1/venues/'
r_venues = requests.get(url_venues, timeout=30)
r_venues.raise_for_status()
json_data_venues = r_venues.json() # decode the JSON response into a dictionary
venues_df = pd.DataFrame(json_data_venues["venues"]) # the venue records are stored under the key "venues"
site_lat = venues_df["lat"]
site_lon = venues_df["lon"]
locations_name = venues_df["name"]
category_name = venues_df["category"]

# Mining pool locations from a local CSV file.
df = pd.read_csv('Miningpools.csv')
mp_site_lat = df.lat
mp_site_lon = df.lon
mp_locations_name = df.name
# Map with three marker layers: ATMs (red), venues (blue) and mining pools
# (turquoise). The hover text of each layer is assembled element-wise from the
# corresponding columns.
atm_trace = go.Scattermapbox(
    name='Atm name',
    mode='markers',
    lat=site_lat_btc_atm,
    lon=site_lon_btc_atm,
    marker=dict(size=8, color='red', opacity=0.7),
    text= locations_name_btc_atm + ', ' + address +'<br>'+ 'Currency: '+ currency + '<br>' +'Buy: '+ buy + ', Sell: ' + sell,
)
venue_trace = go.Scattermapbox(
    name='Venues',
    mode='markers',
    lat=site_lat,
    lon=site_lon,
    marker=dict(size=8, color='rgb(14, 88, 199)', opacity=0.7),
    text='Store name: ' +locations_name+ '<br>' + 'Category: ' +category_name,
)
mining_pool_trace = go.Scattermapbox(
    name='Mining Pools',
    mode='markers',
    lat=mp_site_lat,
    lon=mp_site_lon,
    marker=dict(size=10, color='rgb(64,224,208)', opacity=0.7),
    text='Mining pool: ' + mp_locations_name,
)
data = [atm_trace, venue_trace, mining_pool_trace]

# Initial view: centred on latitude 52.52 / longitude 13.405 at zoom level 10.
map_view = dict(
    accesstoken=mapbox_access_token,
    center=dict(lat=52.52, lon=13.4050),
    zoom=10,
    bearing=0,
    pitch=0,
)
layout = go.Layout(
    title = 'Map of Bitcoin Accepting Values and <br> Map of ATM for Cryptocurrency and Mining Pools',
    autosize=True,
    hovermode='closest',
    mapbox=map_view,
)
fig = {'data': data, 'layout': layout}
iplot(go.Figure(fig))
#Reference:
#Inspired by https://www.dataquest.io/blog/python-json-tutorial/
#Inspired by https://plot.ly/python/scattermapbox/
#Inspired by http://docs.python-requests.org/en/latest/user/quickstart/
#Inspired by https://plot.ly/python/legend/
Map showing only the venues:
# Map showing only the venues that accept cryptocurrency (Coinmap API).

# Mapbox access token used by the map layout.
mapbox_access_token = 'pk.eyJ1Ijoic2FyYXB1dHJpIiwiYSI6ImNqaTMzNDBuaTB2djgzdm9hZXlnMTl1cW4ifQ.8vsxIGidl6bUz-u_rK3YSQ'
url = 'https://coinmap.org/api/v1/venues/'
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail with a clear HTTP error instead of an obscure JSON/KeyError further down.
r.raise_for_status()
json_data = r.json() # decode the JSON response into a dictionary
venues_df = pd.DataFrame(json_data["venues"]) # the venue records are stored under the key "venues"
site_lat = venues_df["lat"]
site_lon = venues_df["lon"]
locations_name = venues_df["name"]
category_name = venues_df["category"]

data = [
    go.Scattermapbox(
        lat=site_lat,
        lon=site_lon,
        mode='markers',
        marker=dict(
            size=9
        ),
        # Hover label, built element-wise from the name and category columns.
        text='store name: ' +locations_name+ ', ' + 'category: ' +category_name,
        hoverinfo="text"  # show only the custom text on hover
    )
]
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Initial view: centred on latitude 52.52 / longitude 13.405.
        center=dict(
            lat=52.52,
            lon=13.4050
        ),
        pitch=0,
        zoom=10
    ),
)
fig = dict(data=data, layout=layout)
iplot(go.Figure(fig))
#______________________________________________________________________________
# Preprocessing for the NER = Named Entity Recognition
#______________________________________________________________________________
# Load the pickled tables from the Pickle_dash directory; several of them feed
# the organization/person bar charts.
import pickle


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# News (CCN)
ccn_df_tagged = _load_pickle('Pickle_dash/ccn_df_tagged.pkl')
df_agg_count_ccn = _load_pickle('Pickle_dash/df_agg_count_ccn.pkl')
# Reddit, BTC
df_btc_reddit_org_agg = _load_pickle('Pickle_dash/df_btc_reddit_org_agg.pkl')
df_btc_reddit_pep_agg = _load_pickle('Pickle_dash/df_btc_reddit_pep_agg.pkl')
# Twitter, organizations
df_tweet_agg_sum = _load_pickle('Pickle_dash/df_tweet_agg_sum.pkl')
df_tweet_btc_agg_sum = _load_pickle('Pickle_dash/df_tweet_btc_agg_sum.pkl')
df_tweet_eth_agg_sum = _load_pickle('Pickle_dash/df_tweet_eth_agg_sum.pkl')
# Twitter, people
df_tweet_pep_agg = _load_pickle('Pickle_dash/df_tweet_pep_agg.pkl')
df_tweet_pep_btc_agg = _load_pickle('Pickle_dash/df_tweet_pep_btc_agg.pkl')
df_tweet_pep_eth_agg = _load_pickle('Pickle_dash/df_tweet_pep_eth_agg.pkl')
# Reddit, ETH
df_eth_reddit_org_agg = _load_pickle('Pickle_dash/df_eth_reddit_org_agg.pkl')
df_eth_reddit_pep_agg = _load_pickle('Pickle_dash/df_eth_reddit_pep_agg.pkl')
# Horizontal bar chart of the organizations mentioned on Twitter, Reddit and in
# the CCN news, with a dropdown to isolate one source.
# Trace order (the 'visible' masks of the buttons depend on it):
#   0 = Twitter (BTC & ETH), 1 = Twitter (BTC), 2 = Twitter (ETH),
#   3 = Reddit (BTC), 4 = Reddit (ETH), 5 = CCN news
# Each trace takes the first rows of its table (15, or 5 for Reddit) and
# reverses them via sort_index(ascending=False) - presumably so that the first
# entry ends up at the top of the horizontal bar chart; confirm against the
# ordering of the pickled tables.
figure = {'data':[
    go.Bar(
        x = df_tweet_agg_sum['Number'].head(15).sort_index(ascending=False),
        y = df_tweet_agg_sum['Organization'].head(15).sort_index(ascending=False),
        name = "Twitter (BTC & ETH)",
        visible=True,
        orientation = 'h'
    ),
    go.Bar(
        x = df_tweet_btc_agg_sum['Number'].head(15).sort_index(ascending=False),
        y = df_tweet_btc_agg_sum['Organization'].head(15).sort_index(ascending=False),
        name = "Twitter (BTC)",
        visible=True,
        orientation = 'h'
    ),
    go.Bar(
        x = df_tweet_eth_agg_sum['Number'].head(15).sort_index(ascending=False),
        y = df_tweet_eth_agg_sum['Organization'].head(15).sort_index(ascending=False),
        name = "Twitter (ETH)",
        visible=True,
        orientation = 'h'
    ),
    go.Bar(
        x = df_btc_reddit_org_agg['Number'].head(5).sort_index(ascending=False),
        y = df_btc_reddit_org_agg['Organization'].head(5).sort_index(ascending=False),
        name = "Reddit (BTC)",
        visible=True,
        orientation = 'h',
        marker=dict(
            color='rgb(231, 60, 0)')
    ),
    go.Bar(
        x = df_eth_reddit_org_agg['Number'].head(5).sort_index(ascending=False),
        y = df_eth_reddit_org_agg['Organization'].head(5).sort_index(ascending=False),
        name = "Reddit (ETH)",
        visible=True,
        orientation = 'h',
        marker=dict(
            color='rgb(231, 60, 0)')
    ),
    go.Bar(
        # The CCN table stores its counts in the column 'Sum', not 'Number'.
        x = df_agg_count_ccn['Sum'].head(15).sort_index(ascending=False),
        y = df_agg_count_ccn['Organization'].head(15).sort_index(ascending=False),
        name = "CCN news",
        visible=True,
        orientation = 'h',
        marker=dict(
            color='rgb(231, 118, 0)')
    ),
],
'layout':go.Layout(title = 'Organizations on Twitter, Reddit & CCN news', showlegend=True,
    updatemenus = [
        dict(active=-1,
            buttons=[
                dict(label = 'Twitter (BTC & ETH)',
                    method = 'update',
                    args = [{'visible': [True, False, False, False, False, False]},
                            {'title': 'TOP 15 organizations on Twitter (BTC & ETH)'}]),
                dict(label = 'Twitter (BTC)',
                    method = 'update',
                    args = [{'visible': [False, True, False, False, False, False]},
                            {'title': 'TOP 15 organizations on Twitter (BTC)'}]),
                dict(label = 'Twitter (ETH)',
                    method = 'update',
                    args = [{'visible': [False, False, True, False, False, False]},
                            {'title': 'TOP 15 organizations on Twitter (ETH)'}]),
                dict(label = 'Reddit (BTC)',
                    method = 'update',
                    args = [{'visible': [False, False, False, True, False, False]},
                            {'title': 'TOP 5 organizations on Reddit (BTC)'}]),
                dict(label = 'Reddit (ETH)',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, True, False]},
                            {'title': 'TOP 5 organizations on Reddit (ETH)'}]),
                dict(label = 'CCN news',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, False, True]},
                            {'title': 'TOP 15 organizations on CCN news'}]),
                dict(label = 'Reset: show all',
                    method = 'update',
                    # Showing every source again restores the figure's initial
                    # title (previously this wrongly reused the CCN-only title).
                    args = [{'visible': [True, True, True, True, True, True]},
                            {'title': 'Organizations on Twitter, Reddit & CCN news'}])
            ],
            # Dropdown placement: left of the plot area, above the top edge.
            direction = 'down',
            pad = {'r': 10, 't': 10},
            showactive = True,
            x = 0,
            xanchor = 'right',
            y = 1.2,
            yanchor = 'top'
        )
    ],
    xaxis = dict(title = 'Number of occurrences'),
    yaxis = dict(title = ''),
    # Wide left margin leaves room for the organization names on the y axis.
    margin=dict(
        l=250,
        r=20,
        t=70,
        b=70,
    )
)}
iplot(go.Figure(figure))
# Horizontal bar chart of the people mentioned on Twitter and Reddit, with a
# dropdown to isolate one source.
# Trace order (the 'visible' masks of the buttons depend on it):
#   0 = Twitter (BTC & ETH), 1 = Twitter (BTC), 2 = Twitter (ETH),
#   3 = Reddit (BTC), 4 = Reddit (ETH)
# Each trace takes the first rows of its table (15, or 5 for Reddit) and
# reverses them via sort_index(ascending=False) - presumably so that the first
# entry ends up at the top of the horizontal bar chart; confirm against the
# ordering of the pickled tables.
figure = {'data':[
    go.Bar(
        x = df_tweet_pep_agg['Number'].head(15).sort_index(ascending=False),
        y = df_tweet_pep_agg['Person'].head(15).sort_index(ascending=False),
        name = "Twitter (BTC & ETH)",
        visible=True,
        orientation = 'h'
    ),
    go.Bar(
        x = df_tweet_pep_btc_agg['Number'].head(15).sort_index(ascending=False),
        y = df_tweet_pep_btc_agg['Person'].head(15).sort_index(ascending=False),
        name = "Twitter (BTC)",
        visible=True,
        orientation = 'h'
    ),
    go.Bar(
        x = df_tweet_pep_eth_agg['Number'].head(15).sort_index(ascending=False),
        y = df_tweet_pep_eth_agg['Person'].head(15).sort_index(ascending=False),
        name = "Twitter (ETH)",
        visible=True,
        orientation = 'h'
    ),
    go.Bar(
        x = df_btc_reddit_pep_agg['Number'].head(5).sort_index(ascending=False),
        y = df_btc_reddit_pep_agg['Person'].head(5).sort_index(ascending=False),
        name = "Reddit (BTC)",
        visible=True,
        orientation = 'h',
        marker=dict(
            color='rgb(231, 60, 0)')
    ),
    go.Bar(
        x = df_eth_reddit_pep_agg['Number'].head(5).sort_index(ascending=False),
        y = df_eth_reddit_pep_agg['Person'].head(5).sort_index(ascending=False),
        name = "Reddit (ETH)",
        visible=True,
        orientation = 'h',
        marker=dict(
            color='rgb(231, 60, 0)')
    ),
],
'layout':go.Layout(title = 'TOP people discussed on Twitter & Reddit', showlegend=True,
    updatemenus = [
        dict(active=-1,
            buttons=[
                dict(label = 'Twitter (BTC & ETH)',
                    method = 'update',
                    args = [{'visible': [True, False, False, False, False]},
                            {'title': 'TOP 15 people on Twitter (BTC & ETH)'}]),
                dict(label = 'Twitter (BTC)',
                    method = 'update',
                    args = [{'visible': [False, True, False, False, False]},
                            {'title': 'TOP 15 people on Twitter (BTC)'}]),
                dict(label = 'Twitter (ETH)',
                    method = 'update',
                    args = [{'visible': [False, False, True, False, False]},
                            {'title': 'TOP 15 people on Twitter (ETH)'}]),
                dict(label = 'Reddit (BTC)',
                    method = 'update',
                    args = [{'visible': [False, False, False, True, False]},
                            {'title': 'TOP 5 people on Reddit (BTC)'}]),
                dict(label = 'Reddit (ETH)',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, True]},
                            {'title': 'TOP 5 people on Reddit (ETH)'}]),
                dict(label = 'Reset: show all',
                    method = 'update',
                    # Showing every source again restores the figure's initial
                    # title (previously this wrongly said "CCN news", a source
                    # this figure does not contain).
                    args = [{'visible': [True, True, True, True, True]},
                            {'title': 'TOP people discussed on Twitter & Reddit'}])
            ],
            # Dropdown placement: left of the plot area, above the top edge.
            direction = 'down',
            pad = {'r': 10, 't': 10},
            showactive = True,
            x = 0,
            xanchor = 'right',
            y = 1.2,
            yanchor = 'top'
        )
    ],
    xaxis = dict(title = 'Number of occurrences'),
    yaxis = dict(title = ''),
    # Wide left margin leaves room for the person names on the y axis.
    margin=dict(
        l=250,
        r=20,
        t=70,
        b=70,
    )
)}
iplot(go.Figure(figure))